Normalisescoring rates by standardising to score per 40 min (Kubatko, 2007) several models have been attempted (Spector 2020). difficult to predict how 5 opponents will perform WITH each otehr and also against an opposition
Add standardised team scoring rate per 40min period (Kubatko, 2007), and win%
This is a link to the data fields
Chicago Bulls payroll is 24th or the 30 NBA teams
# Barplot team payrolls
team_sal <- ggplot(data = df_team_payroll, aes(x = reorder(team, COL2), y = COL2)) +
geom_bar(stat="identity")
# Horizontal bar plot
new <- team_sal + coord_flip(ylim = c(50, 160))
#label axes
new + labs(title = "Team payroll (2018-19)", face = "italics", x = "Team", y = "Millions")
Win % versus team salary
df_team_WL <- mutate(df_team_WL, winP = (W / (W + L)* 100))
#add win% column
df_team_pts40 <- bind_cols(df_team_pts40, df_team_WL[-c(1:25)])
# team average pts per minute
team_pts40 <- ggplot(data = df_team_pts40, aes(x = reorder(Team, winP), y = winP)) +
geom_bar(stat="identity")
# Horizontal bar plot
new1 <- team_pts40 + coord_flip(ylim = c(20, 75)) +
labs(title = "Team game win percentages (2018-19)", x = "Team", y = "Win %")
new1
Chicago Bulls are ranked 29th of 30 in season average points scored per minute played
# team average pts per minute
ggteam_pts40 <- ggplot(data = df_team_pts40, aes(x = reorder(Team, team_pts40), y = team_pts40)) +
geom_bar(stat="identity")
# Horizontal bar plot
new2 <- ggteam_pts40 + coord_flip(ylim = c(17, 20)) +
labs(title = "Team average points scored per 40 minutes played (2018-19)", x = "Team", y = "Points / 40 Minutes")
new2
standardise variables to events per 40min
df_team_pts40 <- mutate(df_team_pts40, AST40 = (AST / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, TOV40 = (TOV / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, STL40 = (STL / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, BLK40 = (BLK / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, PF40 = (PF / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, TRB40 = (TRB / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, ORB40 = (ORB / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, DRB40 = (DRB / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, FG40 = (FG / MP)* 40)
df_team_pts40 <- mutate(df_team_pts40, FGA40 = (FGA / MP)* 40)
correlation wins and points scored
#points/40 vs win%
df_team_pts40 %>%
ggplot(aes(x = team_pts40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$team_pts40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.9569494
#assists vs win%
df_team_pts40 %>%
ggplot(aes(x = AST40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$AST40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.548208
#turnovers vs win%
df_team_pts40 %>%
ggplot(aes(x = TOV40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$TOV40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.1385349
#steals vs win%
df_team_pts40 %>%
ggplot(aes(x = STL40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$STL40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.2818981
#blocks vs win%
df_team_pts40 %>%
ggplot(aes(x = BLK40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$BLK40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.4495809
#fouls vs win%
df_team_pts40 %>%
ggplot(aes(x = PF40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$PF40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.2266698
#total rebounds vs win%
df_team_pts40 %>%
ggplot(aes(x = TRB40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$TRB40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.5195445
#ORB vs win%
df_team_pts40 %>%
ggplot(aes(x = ORB40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$ORB40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.2199931
#DRB vs win%
df_team_pts40 %>%
ggplot(aes(x = DRB40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$DRB40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.4901269
#FG vs win%
df_team_pts40 %>%
ggplot(aes(x = FG40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$FG40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.7850829
#FGA vs win%
df_team_pts40 %>%
ggplot(aes(x = FGA40,y = winP)) +
geom_point(colour = "dodgerblue") +
ylim(0, 100) +
geom_smooth(method = "lm", colour = "magenta") +
geom_hline(yintercept = 50, colour = "black", linetype = "dashed")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$FGA40, y = df_team_pts40$winP, method = "pearson")
## [1] 0.5700753
team stats related to pts40
#AST40 vs pts40
df_team_pts40 %>%
ggplot(aes(x = AST40,y = team_pts40)) +
geom_point(colour = "dodgerblue") +
geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$AST40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.5823599
#TOV40 vs pts40
df_team_pts40 %>%
ggplot(aes(x = TOV40,y = team_pts40)) +
geom_point(colour = "dodgerblue") +
geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$TOV40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.06309648
#STL40 vs pts40
df_team_pts40 %>%
ggplot(aes(x = STL40,y = team_pts40)) +
geom_point(colour = "dodgerblue") +
geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$STL40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.1702733
#BLK40 vs pts40
df_team_pts40 %>%
ggplot(aes(x = BLK40,y = team_pts40)) +
geom_point(colour = "dodgerblue") +
geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$BLK40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.4192942
#TRB40 vs pts40
df_team_pts40 %>%
ggplot(aes(x = TRB40,y = team_pts40)) +
geom_point(colour = "dodgerblue") +
geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$TRB40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.5844159
#DRB vs pts40
df_team_pts40 %>%
ggplot(aes(x = DRB40,y = team_pts40)) +
geom_point(colour = "dodgerblue") +
geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$DRB40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.5767078
#ORB vs pts40
df_team_pts40 %>%
ggplot(aes(x = ORB40,y = team_pts40)) +
geom_point(colour = "dodgerblue") +
geom_smooth(method = "lm", colour = "magenta")
## `geom_smooth()` using formula 'y ~ x'
cor(x = df_team_pts40$ORB40, y = df_team_pts40$team_pts40, method = "pearson")
## [1] 0.2002259
** choose variables associated with high scoring ** 3 explanatory variables associated with win% and total score 40 are: Points Scored Assists Blocks Total rebounds
Combine player statistics and player salary data files by ‘player names’
# join df_sal & df_pl_stat by "player_name"
df_players <- left_join(x = df_sal, y = df_players, by = c("player_name"))
write.csv(df_players, file = 'data_processed/2018-19_nba_player-salaries-stats_.csv', row.names = FALSE)
Clean processed data file
sum(is.na(df_players))
## [1] 1901
naniar::vis_miss(df_players)
#filter out players with no position or game time
df_pl_clean <- drop_na(df_players, Pos)
naniar::vis_miss(df_pl_clean)
Create normalised data by adding variable “points per minute played” Explore the data filter out games less than 10 to minimise outliers
#ggplot(data = df_players) +
# geom_histogram(mapping = aes(x = salmil), colour = "black", fill = "dodgerblue") +
# labs(x = "millions", y = "number of players", title = "Player salary distribution per year")
df_players <- mutate(df_players, pts40 = (PTS / MP)* 40)
df_players %>%
group_by(player_id) %>%
ggplot() +
geom_histogram(mapping = aes(x = pts40), colour = "black", fill = "dodgerblue") +
labs(x = "points per minutes played", y = "number of players", title = "Distribution of points scored per minute played", subtitle = "(games played > 10)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 64 rows containing non-finite values (stat_bin).
df_players %>%
group_by(player_id) %>%
filter(G > 10) %>%
ggplot() +
geom_histogram(mapping = aes(x = pts40), colour = "black", fill = "dodgerblue") +
labs(x = "points per minutes played", y = "number of players", title = "Distribution of points scored per minute played", subtitle = "(games played > 10)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
merge rows, add column sal.mil
df_players <- mutate(df_players, salmil = salary / 100000)
#combine rows with same player names
df_players <- bind_rows(df_players) %>%
group_by(player_id, player_name, Pos) %>%
summarise(salmil = mean(salmil, na.rm = TRUE),
G = sum(G, na.rm = TRUE),
MP = sum(MP, na.rm = TRUE),
TRB = sum(TRB, na.rm = TRUE),
AST = sum(AST, na.rm = TRUE),
BLK = sum(BLK, na.rm = TRUE),
PTS = sum(PTS, na.rm = TRUE))
## `summarise()` has grouped output by 'player_id', 'player_name'. You can override using the `.groups` argument.
df_players40 <- bind_rows(df_players) %>%
group_by(player_id, player_name) %>%
summarise(salmil = mean(salmil, na.rm = TRUE),
G = sum(G, na.rm = TRUE),
MP = sum(MP, na.rm = TRUE),
TRB = sum(TRB, na.rm = TRUE),
AST = sum(AST, na.rm = TRUE),
BLK = sum(BLK, na.rm = TRUE),
PTS = sum(PTS, na.rm = TRUE))
## `summarise()` has grouped output by 'player_id'. You can override using the `.groups` argument.
df_players <- mutate(df_players, pts40_ind = (PTS / MP)* 40)
df_players <- mutate(df_players, AST40 = (AST / MP)* 40)
df_players <- mutate(df_players, BLK40 = (BLK / MP)* 40)
df_players <- mutate(df_players, TRB40 = (TRB / MP)* 40)
df_players40 <- mutate(df_players40, pts40_ind = (PTS / MP)* 40)
df_players40 <- mutate(df_players40, AST40 = (AST / MP)* 40)
df_players40 <- mutate(df_players40, BLK40 = (BLK / MP)* 40)
df_players40 <- mutate(df_players40, TRB40 = (TRB / MP)* 40)
look for relationships b/t Player positions and Explanatory Variables
#player group by pts_ind
df_players %>%
filter(G > 10) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(Pos, pts40_ind, FUN = median),
y = pts40_ind, colour = reorder(Pos, pts40_ind,
FUN = median))) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45))
#player group by AST
df_players %>%
filter(G > 10) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(Pos, AST40, FUN = median),
y = AST40, colour = reorder(Pos, AST40,
FUN = median))) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45))
#player group by BLK
df_players %>%
filter(G > 10) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(Pos, BLK40, FUN = median),
y = BLK40, colour = reorder(Pos, BLK40,
FUN = median))) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45))
#player group by TRB
df_players %>%
filter(G > 10) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(Pos, TRB40, FUN = median),
y = TRB40, colour = reorder(Pos, TRB40,
FUN = median))) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45))
fit <- lm(winP ~ AST40 + BLK40 + TRB40, data = df_team_pts40)
tidy(fit, conf.int = TRUE)
## # A tibble: 4 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -125. 47.1 -2.65 0.0134 -222. -28.1
## 2 AST40 14.1 7.81 1.80 0.0829 -1.97 30.1
## 3 BLK40 17.3 22.1 0.781 0.442 -28.2 62.8
## 4 TRB40 13.8 6.80 2.03 0.0523 -0.149 27.8
** Test Assumptions ** 1. Response variable (team win %) IS continuous 2. Explanatory variables (assists, blocks & total rebounds are continuous integers) 3. Independence: potentially fails due to repeated measures of positions
car::durbinWatsonTest(fit)
## lag Autocorrelation D-W Statistic p-value
## 1 0.4891626 0.9163006 0
## Alternative hypothesis: rho != 0
car::avPlots(fit)
std_res <- rstandard(fit)
points <- 1:length(std_res)
res_labels <- if_else(abs(std_res) >= 2.5, paste(points), "")
ggplot(data = NULL, aes(x = points, y = std_res)) + geom_point() +
geom_text(aes(label = res_labels), nudge_y = 0.3) + ylim(c(-4,4)) +
geom_hline(yintercept = c(-2.5, 2.5), colour = "red", linetype = "dashed")
lm +
geom_text(aes(label = res_labels), nudge_x = 0.002)
## NULL
cook <- cooks.distance(fit)
cook_labels <- if_else(cook >= 0.015, paste(points), "")
ggplot(data = NULL, aes(x = points, y = cook)) + geom_point() +
geom_text(aes(label = cook_labels), nudge_y = 0.01)
6. Homoscedasticity
res <- residuals(fit)
fitted <- predict(fit)
ggplot(data = NULL, aes(x = fitted, y = res)) +
geom_point(colour = "dodgerblue") +
geom_smooth(se = FALSE, colour = "magenta")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
7. Normally distributed residuals
ggplot(data = NULL, aes(sample = res)) + stat_qq() + stat_qq_line()
#team
pairs(formula = ~ winP + AST40 + BLK40 + TRB40, data = df_team_pts40)
fit <- lm(winP ~ AST40 + BLK40 + TRB40, data = df_team_pts40)
tidy(fit, conf.int = TRUE)
## # A tibble: 4 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -125. 47.1 -2.65 0.0134 -222. -28.1
## 2 AST40 14.1 7.81 1.80 0.0829 -1.97 30.1
## 3 BLK40 17.3 22.1 0.781 0.442 -28.2 62.8
## 4 TRB40 13.8 6.80 2.03 0.0523 -0.149 27.8
sqrt(car::vif(fit))
## AST40 BLK40 TRB40
## 1.247104 1.225111 1.107027
head(predict(fit))
## 1 2 3 4 5 6
## 66.97234 67.84684 62.32325 62.36924 48.60328 52.65836
#TRB
df_players %>%
filter(G > 10, Pos %in% c("C", "PF", "PG", "SF", "SG")) %>%
ggplot() +
geom_histogram(aes(x = TRB40, fill = Pos), colour = "black", bins = 40) + facet_wrap(~Pos, nrow = 5)
#assist
df_players %>%
filter(G > 10, Pos %in% c("C", "PF", "PG", "SF", "SG")) %>%
ggplot() +
geom_histogram(aes(x = AST40, fill = Pos), colour = "black", bins = 40) + facet_wrap(~Pos, nrow = 5)
#pts40
df_players %>%
filter(G > 10, Pos %in% c("C", "PF", "PG", "SF", "SG")) %>%
ggplot() +
geom_histogram(aes(x = pts40_ind, fill = Pos), colour = "black", bins = 40) + facet_wrap(~Pos, nrow = 5)
# create panels by match_outcome
df_players %>%
filter(G > 10) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(Pos, AST40, FUN = median),
y = AST40, colour = reorder(Pos, AST40,
FUN = median))) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45))
#salary by points per minutes played
df_players40 %>% filter(G > 10, PTS > 0) %>% group_by(player_id) %>%
ggplot(mapping = aes(x = salmil, y = pts40_ind)) +
geom_point()
#salary by points per minutes played
df_players %>% filter(G > 10) %>% group_by(player_id) %>%
ggplot(mapping = aes(x = salmil, y = pts40_ind, colour = Pos)) +
geom_point()
#player group by salmil
df_players %>%
filter(G > 10) %>%
ggplot() +
geom_boxplot(mapping = aes(x = reorder(Pos, salmil, FUN = median),
y = salmil, colour = reorder(Pos, salmil,
FUN = median))) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45))
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.